Intro

I play a lot of chess. Like … a lot of chess. From July 2021-January of 2022 I played over 1600 games on Lichess. For this project I’m analyzing all of my games played on Lichess using the chessR package by Jason Zivkovic. This package is an R wrapper for the Lichess API, making it extremely easy to import data from my games into the R environment.

# devtools::install_github("JaseZiv/chessR")
library(chessR)
library(dplyr)
library(ggplot2)
library(ggridges)
library(plotly)
library(lubridate)

Now I’ll retrieve all of my game data from Lichess

lichess_data <- get_raw_lichess('Eldiel_Prime')
## Extracting  Eldiel_Prime  games. Please wait

Cleaning

I’ll start by using the chessR helper functions num_moves() and get_winner() to determine who won each game, and how many moves were played in each game respectively. Then I’ll perform my own analysis to determine stats like my elo and the opponents elo of each game, which color I played as, determining my result, and converting to PST date-time.

#helper functions from chessR
lichess_data$num_moves <- return_num_moves(moves_string = lichess_data$Moves)
lichess_data$winner <- get_winner(result_column = lichess_data$Result,
                                  white = lichess_data$White,
                                  black = lichess_data$Black)
#change dates to date class
lichess_data$Date <- ymd(lichess_data$Date)

#day of week
lichess_data$day_of_week <- wday(lichess_data$Date, label = TRUE)

#determine which color I played as
lichess_data$my_color <- NA
lichess_data$my_color[lichess_data$White == "eldiel_prime"] <- "White"
lichess_data$my_color[lichess_data$Black == "eldiel_prime"] <- "Black"

#determine my elo and opponent's elo
lichess_data$my_elo <- NA
lichess_data$opp_elo <- NA

lichess_data$my_elo[lichess_data$White == "eldiel_prime"] <- lichess_data$WhiteElo[lichess_data$White == "eldiel_prime"]
lichess_data$my_elo[lichess_data$Black == "eldiel_prime"] <- lichess_data$BlackElo[lichess_data$Black == "eldiel_prime"]

lichess_data$opp_elo[lichess_data$White == "eldiel_prime"] <- lichess_data$BlackElo[lichess_data$White == "eldiel_prime"]
lichess_data$opp_elo[lichess_data$Black == "eldiel_prime"] <- lichess_data$WhiteElo[lichess_data$Black == "eldiel_prime"]

#rename time controls
lichess_data$TimeControl[lichess_data$TimeControl == '300+0'] <- '5+0'
lichess_data$TimeControl[lichess_data$TimeControl == '600+0'] <- '10+0'
lichess_data$TimeControl[lichess_data$TimeControl == '60+0'] <- '1+0'
lichess_data$TimeControl[lichess_data$TimeControl == '180+2'] <- '3+2'
lichess_data$TimeControl[lichess_data$TimeControl == '120+1'] <- '2+1'
lichess_data$TimeControl[lichess_data$TimeControl == '180+0'] <- '3+0'

#determine my result, win loss or draw
lichess_data$my_result <- 'loss'
lichess_data$my_result[lichess_data$winner == 'eldiel_prime'] <- 'win'
lichess_data$my_result[lichess_data$winner == 'Draw'] <- 'draw'

#make factor to assign custom ordering for later use
lichess_data$my_result <- factor(lichess_data$my_result,
                                 levels = c('loss','draw','win'))

#drop rows where opp elo is '?'
lichess_data <- lichess_data[lichess_data$opp_elo != '?',]

#ensure elos are numeric
lichess_data$opp_elo <- as.numeric(lichess_data$opp_elo)
lichess_data$my_elo <- as.numeric(lichess_data$my_elo)

#generate opening counts by color
lichess_data <- lichess_data %>% 
  group_by(Opening,my_color) %>%
  mutate(opening_count_by_color = n())

#generate daily counts of game type
lichess_data <- lichess_data %>% 
  group_by(Date, Event) %>%
  mutate(daily_type_count = n())

#create date-times and convert to local PST
lichess_data$date_time_UTC <- paste(lichess_data$UTCDate, lichess_data$UTCTime)
lichess_data$date_time_UTC <- as_datetime(lichess_data$date_time_UTC)

#now convert to PST
lichess_data$date_time_PST <- format(lichess_data$date_time_UTC,
                                     usetz=TRUE,
                                     tz = 'America/Los_Angeles')

#extract the hour value of each observation in PST
lichess_data$hour <- format(as.POSIXct(lichess_data$date_time_PST),
                            format = "%H")   

Next, I’ll clean the opening names. Chess openings have lots of variations, but I’m not interested in the variations when looking at how many times I’ve played each opening (as some openings can have many dozens of variations). Let’s see how many variations I’ve encountered in my favorite opening: The Queen’s Gambit.

unique(lichess_data$Opening[grepl("Queen's Gambit",lichess_data$Opening,
                   fixed = TRUE)])
##  [1] "Queen's Gambit Accepted: Old Variation"                                      
##  [2] "Queen's Gambit Declined: Exchange Variation, Positional Variation"           
##  [3] "Queen's Gambit Declined: Marshall Defense"                                   
##  [4] "Queen's Gambit Declined: Queen's Knight Variation"                           
##  [5] "Queen's Gambit Declined: Modern Variation, Normal Line"                      
##  [6] "Queen's Gambit Declined: Vienna Variation, Quiet Variation"                  
##  [7] "Queen's Gambit"                                                              
##  [8] "Queen's Gambit Declined: Traditional Variation"                              
##  [9] "Queen's Gambit Declined"                                                     
## [10] "Queen's Gambit Declined: Baltic Defense"                                     
## [11] "Queen's Gambit Declined: Harrwitz Attack, Orthodox Defense"                  
## [12] "Queen's Gambit Declined: Chigorin Defense, Exchange Variation"               
## [13] "Queen's Gambit Declined: Modern Variation, Knight Defense"                   
## [14] "Queen's Gambit Declined: Albin Countergambit, Fianchetto Variation, Bg4 Line"
## [15] "Queen's Gambit Declined: Exchange Variation"                                 
## [16] "Queen's Gambit Declined: Modern Variation"                                   
## [17] "Queen's Gambit Declined: Austrian Defense"                                   
## [18] "Queen's Gambit Declined: Normal Defense"                                     
## [19] "Queen's Gambit Declined: Pseudo-Tarrasch Variation"                          
## [20] "Queen's Gambit Declined: Cambridge Springs Defense, Capablanca Variation"    
## [21] "Queen's Gambit Declined: Albin Countergambit, Fianchetto Variation"          
## [22] "Queen's Gambit Declined: Lasker Defense"                                     
## [23] "Queen's Gambit Declined: Orthodox Defense, Botvinnik Variation"              
## [24] "Queen's Gambit Declined: Albin Countergambit"                                
## [25] "Queen's Gambit Declined: Janowski Variation"                                 
## [26] "Queen's Gambit Declined: Albin Countergambit, Normal Line"                   
## [27] "Queen's Gambit Accepted: Normal Variation, Traditional System"               
## [28] "Queen's Gambit Declined: Cambridge Springs Defense"                          
## [29] "Queen's Gambit Declined: Semi-Tarrasch Defense"                              
## [30] "Queen's Gambit Declined: Harrwitz Attack"                                    
## [31] "Queen's Gambit Declined: Neo-Orthodox Variation, Main Line"                  
## [32] "Queen's Gambit Declined: Charousek Variation"                                
## [33] "Queen's Gambit Declined: Chigorin Defense"                                   
## [34] "Queen's Gambit Declined: Semi-Tarrasch Defense, Exchange Variation"          
## [35] "Queen's Gambit Declined: Three Knights Variation"                            
## [36] "Queen's Gambit Declined: Marshall Defense, Tan Gambit"

That’s 36 variations just in a single opening. For my analysis, I want all of the variations to be classified just as the base opening. Here I’ll standardize the opening names a little bit.

lichess_data$Opening[grepl("Caro-Kann",lichess_data$Opening,
                   fixed = TRUE)] <- "Caro-Kann Defense"

lichess_data$Opening[grepl("Queen's Gambit",lichess_data$Opening,
                           fixed = TRUE)] <- "Queen's Gambit"

lichess_data$Opening[grepl("King's Indian",lichess_data$Opening,
                           fixed = TRUE)] <- "King's Indian Defense"

lichess_data$Opening[grepl("Nimzo-Indian Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "Nimzo-Indian Defense"

lichess_data$Opening[grepl("Queen's Pawn Game",lichess_data$Opening,
                   fixed = TRUE)] <- "Queen's Pawn Game"

lichess_data$Opening[grepl("English Opening",lichess_data$Opening,
                   fixed = TRUE)] <- "English Opening"

lichess_data$Opening[grepl("Englund Gambit",lichess_data$Opening,
                   fixed = TRUE)] <- "Englund Gambit"

lichess_data$Opening[grepl("French Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "French Defense"

lichess_data$Opening[grepl("Sicilian Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "Sicilian Defense"

lichess_data$Opening[grepl("Scandinavian Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "Scandinavian Defense"

lichess_data$Opening[grepl("Benoni Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "Benoni Defense"

lichess_data$Opening[grepl("Vienna Game",lichess_data$Opening,
                   fixed = TRUE)] <- "Vienna Game"

#turn anything with "Semi-Slav" into "Semi-Slav" 
#before running checks for "Slav Defense" !!!

lichess_data$Opening[grepl("Semi-Slav",lichess_data$Opening,
                   fixed = TRUE)] <- "Semi-Slav"

lichess_data$Opening[grepl("Slav Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "Slav Defense"

lichess_data$Opening[grepl("Modern Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "Modern Defense"

lichess_data$Opening[grepl("Dutch Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "Dutch Defense"

lichess_data$Opening[grepl("Budapest Defense",lichess_data$Opening,
                   fixed = TRUE)] <- "Budapest Defense"

#generate total opening counts
lichess_data <- lichess_data %>% 
  group_by(Opening) %>%
  mutate(opening_count = n())

I also want to do some analysis on specific moves in my games.

#split 'moves' into individual components
moves<- suppressWarnings(gsub("\\{.*?\\}", "", lichess_data$Moves,
                              perl = TRUE) %>% strsplit(., "\\s+"))

#most common first move
first_moves <- c()
for(i in moves){
  first_moves <- append(first_moves,i[2])
}

#make df of first moves, my color
moves_df <- data.frame(first_moves, lichess_data$my_color)
colnames(moves_df) <- c("first_moves","my_color")

## Special moves
#how many queenside castles?
num_queenside_castle <- c()
for(i in moves){
  queenside_castle <- i[grepl("O-O-O",i,fixed = TRUE)]
  num_queenside_castle <- append(num_queenside_castle, queenside_castle)
}
num_queenside_castle <- length(num_queenside_castle)

#how many castles?
num_castle <- c()
for(i in moves){
  castle <- i[grepl("O-O",i,fixed = TRUE)]
  num_castle <- append(num_castle, castle)
}
#subtract queenside bc "O-O" grepl also picks up "O-O-O"
num_castle <- length(num_castle) - num_queenside_castle

#checkmates
num_mate <- c()
for(i in moves){
  mate <- i[grepl("#",i,fixed = TRUE)]
  num_mate <- append(num_mate, mate)
}
num_mate <- length(num_mate)

#checks
num_check <- c()
for(i in moves){
  check <- i[grepl("+",i,fixed = TRUE)]
  num_check <- append(num_check, check)
}
num_check <- length(num_check)

#promotions
num_promotion <- c()
for(i in moves){
  promotion <- i[grepl("=",i,fixed = TRUE)]
  num_promotion <- append(num_promotion, promotion)
}
num_promotion <- length(num_promotion)

##Move counts by piece type
#how many king moves?
num_king <- c()
for(i in moves){
  king <- i[grepl("K",i,fixed = TRUE)]
  num_king <- append(num_king, king)
}
num_king <- length(num_king)
#add castling / queenside castling, as these are considered king moves
num_king <- num_king + num_castle +num_queenside_castle

#how many queen moves?
num_queen <- c()
for(i in moves){
  queen <- i[grepl("Q",i,fixed = TRUE)]
  num_queen <- append(num_queen, queen)
}
num_queen <- length(num_queen)

#how many rook moves?
num_rook <- c()
for(i in moves){
  rook <- i[grepl("R",i,fixed = TRUE)]
  num_rook <- append(num_rook, rook)
}
num_rook <- length(num_rook)

#how many bishop moves?
num_bishop <- c()
for(i in moves){
  bishop <- i[grepl("B",i,fixed = TRUE)]
  num_bishop <- append(num_bishop, bishop)
}
num_bishop <- length(num_bishop)

#how many knight moves?
num_knight <- c()
for(i in moves){
  knight <- i[grepl("N",i,fixed = TRUE)]
  num_knight <- append(num_knight, knight)
}
num_knight <- length(num_knight)

#make df of piece,move count
#not including pawn bc there are 8 of them and 2/1 of pieces
piece_type_df <- data.frame(
  'piece' = c('King','Queen','Rook','Bishop','Knight'),
  'move_count' = c(num_king, num_queen, num_rook, num_bishop, num_knight))

Visualization

Now that I’ve imported and cleaned my data, I’ll visualize some findings. I’ve organized these plots into categories for more cohesive insights.

Time Control

Which time controls do I play the most?

lichess_data %>% group_by(TimeControl) %>%
  count(TimeControl) %>% 
  ggplot(aes(x= reorder(TimeControl,n), y= n, fill = TimeControl)) +
  geom_col(alpha = 1) +
  geom_text(aes(label = n), hjust = -0.3) +
  coord_flip() +
  ylim(0,1380) +
  labs(x= "Time Control", y= "Number of Games") +
  ggtitle("Most Played Time Controls") +
  theme_bw() +
  theme(legend.position="none")

What is my win/loss/draw percentage in each time control?

#drop 10+0 because it's all casual games against low elo

ggplotly(
  lichess_data[lichess_data$TimeControl %in%
               c("1+0","2+1","3+0","3+2","5+0"),] %>% 
  group_by(TimeControl,my_result) %>% 
  count(TimeControl) %>%
  ggplot(aes(fill = my_result, y = n, x = TimeControl))+
  geom_bar(position = "fill",stat = "identity")+
  labs(y = "Result %", x = NULL, fill = "My Result") +
  ggtitle("Win/Draw/Loss Percentage by Time Control") +
  theme_bw())  %>% config(displayModeBar = F)

Openings

My most played openings (limited to openings I’ve played 12 or more times).

ggplotly(
  lichess_data[lichess_data$opening_count >= 12,] %>% 
  group_by(Opening,my_color,opening_count) %>% 
  count(Opening) %>%
  ggplot(aes(x= n, y=reorder(Opening, opening_count), fill = my_color)) +
  geom_col(position = "stack",stat = 'identity') +
  labs(x= "Number of Games", y= NULL, fill = "My Color") +
  ggtitle("Most Frequent Openings") + 
  scale_fill_manual(values=c("sienna4", "wheat1")) +
  theme_bw()) %>% config(displayModeBar = F)

How do I perform in each opening?

ggplotly(
  lichess_data[lichess_data$opening_count >= 12,] %>% 
  group_by(Opening,my_result,opening_count) %>% 
  count(Opening) %>%
  ggplot(aes(fill = my_result, y = n, x = reorder(Opening,opening_count)))+
  geom_bar(position = "fill",stat = "identity")+
  labs(y = "Result %", x = NULL, fill = "My Result") +
  coord_flip() + 
  ggtitle("Win/Draw/Loss by Opening") +
  theme_bw()) %>% config(displayModeBar = F)

It seems that I perform well in my two most played openings (Caro Kann and Queen’s Gambit). However, I clearly need to rethink (or better study) The Vienna Game, as I’m losing 59% of my games in this opening, and it’s one I’m playing with the white pieces. Other notable insights are that I perform very strongly agaisnt the modern defense, and very poorly against the Budapest Defense.

Most common first move?

ggplotly(
  moves_df %>% group_by(first_moves,my_color) %>% 
  count(first_moves) %>%
  ggplot(aes(x= n, y=reorder(first_moves, n), fill = my_color)) +
  geom_col(position = "stack",stat = 'identity') +
  labs(x= "Number of Games", y= NULL, fill = "My Color") +
  ggtitle("Most Frequent First Moves") + 
  scale_fill_manual(values=c("sienna4", "wheat1")) +
  theme_bw()) %>% config(displayModeBar = F)

Move Count and Special Moves

How do I perform in as the number of moves in a game increases?

ggplot(lichess_data, aes(x = num_moves, fill = my_result))+
  geom_density(alpha = 0.7) + theme_bw() +
  xlim(0,120) + 
  labs(x = "Number of Moves", fill = 'My Result') +
  ggtitle("Performance by Move Count")

Interestingly, I perform better in games with a higher number of moves. This is likely due to the fact that if I make blunders, I resign early and move on to the next game. The distribution of draws in longer games also makes sense, as draws often come from endgames with high move counts.

How often do special move types occur?

special_df <- data.frame('move_type' = c('queenside castle','castle','check',
                                         'checkmate','promotion'),
                         'count' = c(num_queenside_castle, num_castle, num_check,
                                     num_mate, num_promotion))
#now plot
special_df %>%
  ggplot(aes(x= reorder(move_type,-count), y= count, fill = move_type)) +
  geom_bar(stat= 'identity', alpha = 1) +
  geom_text(aes(label = count), vjust = -0.4) +
  labs(y = "Count", x = NULL) + 
  ylim(0,6500) +
  ggtitle("Counts of Special Moves") +
  theme_bw() +
  theme(legend.position="none")

Elo

How has my elo changed over time?

ggplotly(
  lichess_data[lichess_data$Event %in% 
               c("Rated Blitz game","Rated Bullet game"),] %>%
ggplot(aes(x = Date, y = my_elo, color = Event)) +
  geom_step() +
  ylim(1600,2100) + 
  labs(y = "Elo", x = NULL) +
  ggtitle("Elo Over Time") +
  theme_bw()) %>% config(displayModeBar = F)

How do I perform against different elos?

ggplot(lichess_data, aes(x = opp_elo, fill = my_result))+
  geom_density(alpha = 0.7) + theme_bw() +
  xlim(1600,2100) + 
  xlab("Opponent Elo") +
  ggtitle("Performance by Opponent Elo")

Playing Habits

How many games am I playing per day?

p <- ggplot(data = lichess_data[lichess_data$Date >= "2021-7-1", ], 
       aes(x = Date, y = daily_type_count, color = Event)) +
  geom_step() +
  ylim(0,31) + 
  labs(y = "Number of Games", x = NULL) +
  ggtitle("Daily Games Played")+
  theme_bw()

ggplotly(p)

During which day(s) of the week do I play the most?

lichess_data %>% group_by(day_of_week) %>%
  count(day_of_week) %>%
  ggplot(aes(x = day_of_week, y = n, fill = day_of_week)) +
  geom_bar(stat= 'identity', alpha = 1) +
  geom_text(aes(label = n), vjust = -0.4) +
  labs(x = NULL, y = "Games Played") +
  ggtitle("Total Games per Weekday") +
  theme_bw() +
  theme(legend.position="none")

When during the day do I usually play?

lichess_data %>% group_by(hour) %>%
  count(hour) %>%
  ggplot(aes(x = hour, y = n, fill = hour)) +
  geom_bar(stat= 'identity', alpha = 1) +
  geom_text(aes(label = n), vjust = -0.4) +
  ylim(0,185) +
  labs(x = NULL, y = "Games Played") +
  ggtitle("Hourly Game Distribution") +
  theme_bw() +
  theme(legend.position="none")

Pieces

Which pieces get moved the most?

piece_type_df %>%
  ggplot(aes(x= reorder(piece,-move_count), y= move_count, fill = piece)) +
  geom_bar(stat= 'identity', alpha = 1) +
  geom_text(aes(label = move_count), vjust = -0.4) +
  ylim(0,18000) +
  labs(y = "Move Count", x = NULL) +
  theme_bw() +
  theme(legend.position="none")

These orders make sense. Firstly, each player has two of the Knight, Bishop and Rook, os it makes sense that these are moved more than the King and Queen (of which each player only has one). It also makes sense that the knight is the most moved. As a short range piece, the knight needs a lot of moves and a lot of re-routes to get to the action. The Bishops and Rooks are much ‘faster’ pieces which do not have limits on how many squares they can travel each turn, meaning they don’t need as many moves to travel to the action.

Squares

For this section, I’ll actually briefly return to some data wrangling. I want to generate counts for the number of times a piece moves to each square on the board. This can be accomplished with the following loop:

values <- c() #for storing counts
files <- c("a","b","c","d","e","f","g","h")
ranks <- c("1","2","3","4","5","6","7","8")

file_list <- c() #for storing each iteration file
rank_list <- c() #for storing each iteration rank

#loop through each file,rank (to get a square)
for(i in files){
  for(j in ranks){
    #we now have a selected square with coordinates i,j
    count_temp <- c()
    for(k in moves){
      temp <- k[grepl(paste(i,j,sep = ""),k,fixed = TRUE)]
      count_temp <- append(count_temp,temp)
      }
    count_temp <- length(count_temp)
    values <- append(values, count_temp) #store counts
    file_list <- append(file_list, i) #append file
    rank_list <- append(rank_list, j) #append rank
  }
}

#combine into dataframe
grid <- data.frame(file = file_list,
                   rank = as.numeric(rank_list),
                   count = values)

#glimpse at dataframe
grid[1:4,]
##   file rank count
## 1    a    1   344
## 2    a    2   372
## 3    a    3  1034
## 4    a    4  1114

Now that I have a dataframe of counts for each square, I’ll create a heatmap.

#heatmap
  ggplot(grid, aes(x = file, y = rank, fill = count)) +
  geom_tile(color = "white",
            lwd = 0.5,
            linetype = 1) +
  geom_text(aes(label = paste(file,rank, sep = '')), 
            color = "white", size = 2.5, vjust = -1) +
  geom_text(aes(label = count), color = "white", size = 2.5, vjust = 1) +
  coord_fixed() +
  scale_fill_gradient(low = 'blue4',
                      high = 'red1') +
  guides(fill = guide_colourbar(barwidth = 0.5,
                                barheight = 10)) +
  theme_void() #for getting rid of the axes

Most of the action is happening in the center, as it should! The next most popular squares are f3, c3, f6, and c6, where the knights are usually developed to.